library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(skimr)
library(knitr)
library(naniar)
##
## Attaching package: 'naniar'
## The following object is masked from 'package:skimr':
##
## n_complete
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(styler)
library(ggtext)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(grid)
library(timeDate)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Data has been gathered from Kaggle. The data describes New York Times articles from the year 2020. Each article is uniquely identified by its Headline and Abstract, which summarize the article. There are also columns that tell us which news desk worked on the article and which sections it appeared in. The aim is to analyze the popularity of articles; for that, there is a column giving the number of comments.
# Reading in data set
# NOTE(review): path is relative to the script/Rmd location ("../data");
# confirm the working directory when running interactively.
# read_csv() infers column types (8 chr, 2 dbl, 1 dttm per the spec below).
nyt_articles <- read_csv("../data/nyt-articles-2020.csv")
## Rows: 16787 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): newsdesk, section, subsection, material, headline, abstract, keywo...
## dbl (2): word_count, n_comments
## dttm (1): pub_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
The first thing we do is check the number of missing values for every variable in our dataset.
# Checking missing values: one thin bar + point per variable, ordered by
# missing count so the most-missing variable sits on top after coord_flip().
nyt_articles |>
  miss_var_summary() |>
  # fct_reorder (forcats, loaded via tidyverse) is the tidyverse idiom for
  # ordering a discrete axis by a numeric value.
  ggplot(aes(x = fct_reorder(variable, n_miss), y = n_miss)) +
  # geom_col() is the idiomatic replacement for
  # geom_bar(stat = "identity"); "dodge" was a no-op with no fill group.
  geom_col(width = 0.05, show.legend = FALSE) +
  geom_point() +
  coord_flip() +
  theme_classic() +
  labs(
    x = "Variables",
    y = "Count of missing values",
    # Markdown in the title is rendered by element_markdown() below.
    title = "**Initial plot of *missing* values**",
    # caption = "<span style='font-size:8pt'>Data by:
    # <span style='color:#756bb1;'>Dr. Katsuhiko Takabayashi</span> <br> Graph by:
    # <span style='color:#756bb1;'>Avi Arora</span>
    # </span>"
  ) +
  theme(
    plot.title = element_markdown(lineheight = 1.5),
    plot.caption = element_markdown(lineheight = 1.1)
  )
We can see that Subsection has 67% of its values missing. Hence we remove that column from our analysis.
# Since Subsection is almost empty (~67% missing), we remove it
nyt_articles <- nyt_articles |> select(-subsection)

# Combine headline and abstract into a single text field used as the
# de-duplication key. paste0() with an explicit ". " separator avoids the
# stray spaces paste() inserted around the period ("Headline . Abstract").
nyt_articles <- nyt_articles |> mutate(text = paste0(headline, ". ", abstract))

# Rows whose combined text duplicates an earlier row (first occurrence kept).
duplicate_articles <- nyt_articles[duplicated(nyt_articles$text), ]
To judge whether an article is popular or not, we use the number-of-comments column. The idea is that the more comments an article receives, the more popular it is.
Problem - The biggest problem in defining popularity stems from how the data was collected. Since the comment counts were captured at a fixed point, articles published later in the month had less time to accumulate comments than those published earlier.
Solution - There are multiple ways to deal with this problem and none of them is perfect. The way we are trying to resolve this is by defining the threshold for popularity based on the week number of the month. So, the threshold for popularity will be different for week 1 and different for week 4 in a month, giving week 4 some breathing space in terms of number of comments.
# Derive hour of day, month, and day of month from the publication
# timestamp in a single mutate, coercing each formatted string to numeric.
nyt_articles <- nyt_articles |>
  mutate(
    hour  = as.numeric(format(pub_date, "%H")),
    month = as.numeric(format(pub_date, "%m")),
    day   = as.numeric(format(pub_date, "%d"))
  )
# Total comments per day of month, animated across months in plotly.
# BUG FIX: the plotly animation aesthetic is `frame` (singular); the
# original `frames` was an unknown aesthetic and was silently ignored
# (see the "Ignoring unknown aesthetics: frames" warning), so no
# animation slider ever appeared.
x <- nyt_articles |>
  group_by(day, month) |>
  # .groups = "drop" makes the ungrouping explicit and silences the
  # "grouped output by 'day'" message.
  summarise(total_comments = sum(n_comments), .groups = "drop") |>
  ggplot() +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity").
  geom_col(
    mapping = aes(x = day, y = total_comments, frame = month),
    position = "dodge"
  )
ggplotly(x)
# Getting week of month as an index 1-4 from the day of month.
# Bins: days 1-8 -> 1, 9-16 -> 2, 17-23 -> 3, 24-31 -> 4.
# findInterval(day, c(9, 17, 24)) counts how many cut points the day has
# reached, so adding 1 reproduces the original case_when ladder exactly.
nyt_articles <- nyt_articles |>
  mutate(week_count = findInterval(day, c(9, 17, 24)) + 1)
# Median comment count per week-of-month bucket. This replaces four
# copy-pasted filter/median blocks with one grouped summary that yields
# the same numbers (week 1: 88, week 2: 89, week 3: 85.5, week 4: 85).
nyt_articles |>
  group_by(week_count) |>
  summarise(median_comments = median(n_comments), .groups = "drop") |>
  arrange(week_count)
We can see that the median is almost the same for every week number, so we can use a common threshold for all of them. We have chosen 100 as the threshold for popularity.
# Since the median comes out to be 87, we chose 100 as the cut off number
# Flag an article as popular when it drew at least 100 comments.
# if_else with missing = "No" reproduces the original case_when exactly:
# NA comment counts fell through to the catch-all "No" branch.
nyt_articles <- nyt_articles |>
  mutate(is_popular = if_else(n_comments >= 100, "Yes", "No", missing = "No"))
We check whether the headline contains a question mark. A question mark in a headline could be seen as an invitation to provide commentary, thus inducing more comments.
# Check if the headline has a question or not
# In general, question marks could seem like an invitation to comment.
# grepl() always returns TRUE/FALSE (never NA), so ifelse() here yields
# exactly the same "Yes"/"No" labels as the original case_when.
nyt_articles <- nyt_articles |>
  mutate(contains_question = ifelse(grepl("\\?", headline), "Yes", "No"))
# Generating if article was published on weekend or weekday.
# BUG FIX: the original used `<-` inside mutate(), which creates a column
# literally named "is_weekend <- isWeekend(pub_date)" instead of
# `is_weekend`; mutate() requires `=` for column assignment.
nyt_articles <- nyt_articles |>
  mutate(is_weekend = isWeekend(pub_date))